org 100h ; assume cs<=0x2a18 [0xfd..ff]=0

T1 equ 27*4
T2 equ 19*4
T3 equ -32*4
T4 equ 23*4

C equ $+8

; ds is moved: all  constant access is [ss:bp+?]
%define w(xx) word[byte bp+si-0x100+xx]
%define d(xx) dword[byte bp+si-0x100+xx]

  push 0xa000   ;<-[bp+si] = scratch variable
; the 4 most significant bytes of qword[bp+di] will be SCALE
  lds bp,[si-3] ; bp=0, ds=0x6800  table: cos
  pop es              ; es=0xa000  screen
  mov ax,0x4f02
  mov fs,ax           ; fs=0x4f02  table: color_mul/cos
  mov bx,0x10e
  int 10h    ; 320x200 with 65536 colors; assume it's ok (ax=0x004f)

  add ax,0x39c9 ; 10 05 c9 39, should be "db 0f c9 39"
TABLE_STEP equ $-4 ;= 0.000383495197 = 2pi / 16384, also ~ 1 / (256 * pi^2)
  mov gs,ax           ; gs=0x3a18  backbuffer
;  fninit

; Cos table with 16384 entries
COS_TAB:
  imul bx,[bp+di],4 ; bx=[ss:bp+di]=[ss:-2]=angle (0 on init)
  fild word[bp+di]
  fmul d(TABLE_STEP)
  fcos           ;; cos(angle/65536*2pi): adjust period to 2pi
  fst dword[bx]

  fldpi           ; color_mul = 3.142
  fdivrp st1,st0  ; color_mul / cos(...)
;  fdivr d(COLMUL)  ; color_mul / cos(...)
  fstp dword[fs:bx]
  inc word[bp+di] ; next angle
  jnz COS_TAB     ; bx=4

; Frame loop
M: ; bp=0 cx=timer

; Precompute the scale.
  imul bx,cx,T2
  fld d(ZOOM)
  fsub dword[bx]        ;; zoom-cos(t2)
  fdiv d(ZOOM)          ;; (zoom-cos(t2))/zoom
  fstp qword[bp+si]     ;; scale = 1 - cos(t2)/zoom
  inc cx
ZOOM equ $-4  ; =9.679

; Pixel loop
X mov ax,0xcccd ; convert width 320 -> 65536
  mul di
  xchg ax,bx    ; full 16-bit precision of X
  mov ax,0x4f05
  add bx,ax
  adc dx,0x9b80 ; center at [100, 159.5]: should be 0x9b804d46

  pusha ; [-18-16-14-12-10 -8 -6 -4] on the stack
        ;   di si bp sp bx dx cx ax
        ;                  yy
        ;                x x

  add di,di
  jnz D
  cwd            ; twice per frame: set window, assume 64kB granularity
  adc dx,dx
  xor bx,bx      ; bh=0 bl=window=0 dx=page(0 or 1)
  int 10h

D mov ax,[gs:di]
  jc COPY_MIRROR  ; compute only the top half of the screen

  push di
  call IT
  pop di

  sub bp,di
  mov [gs:bp+320*200*2-65536 - 2],ax
COPY_MIRROR:
  stosw
;  stosw   ; 2x faster, put 4 ^^^ here
  popa
;  inc di  ; 2x faster
  inc di
  jnz X  ; di=0

;  call SCREENSHOT

  in al,60h ; ESC check
  cmp al,1
  jne M     ; exit later

IT:
  ; [-18-16-14-12-10 -8 -6 -4]
  ;   di si bp sp bx dx cx ax
  ;                  yy
  ;                x x

Z fldz
  inc bp
  jpo Z   ; loop 3x, bp=3  ;; R=0 G=0 B=0

  dec bp
L dec bp            ; bp:1,0 -> [-8],[-9] -> y,x
  fild word[bp-9]
  fadd st0          ;; x[-65536..65536] y R G B
;  fldpi
;  fimul word[bp-9]
  jpo L  ; loop 2x, bp=0 again, zero flag = 1

  jmp LEN
LEN_RET:
  imul di,[bp+si],4 ; di = d = 65536/2pi * length(x,y)/2

  imul dx,cx,T4
  sub dx,di         ; dx = t4-d
  imul bx,cx,T1     ; bx = t1, will be pushed
  imul cx,T3
  add di,cx         ; di = d-t3

  mov ax,0x8000 + 10 ; al = number of iterations, ~0x8000 fold offset

  ;cl: horrible parity hack
  ; 25 00100101 o <- start
  ; 26 00100110 o        <- after Q
  ; 27 00100111 e   <- after R
  ; 28 00101000 e
  ; 29 00101001 o     <- after F

; rotate
; [x] = [C -S] * [x]
; [y]   [S  C]   [y]
I mov cl,0x25    ; RGB phase shift, later shift length

R fld st1          ;; y x y R G B    | x Sy x Cy R G B
  fmul dword[bx]   ;; Cy x y R G B   | Cx Sy x Cy R G B
  fxch st2         ;; y x Cy R G B  | x Sy Cx Cy R G B
  fmul dword[bx-0x4000]
  inc cx           ;; Sy x Cy R G B  | Sx Sy Cx Cy R G B
  jpo R ; loop 2x: cl=0x27
  faddp st3,st0  ;; Sy Cx Sx+Cy R G B
  fsubp st1,st0  ;; x=Cx-Sy y=Sx+Cy R G B

; scale, square fold
F fmul qword[bp+si]  ; scale
  fistp dword[bp+si] ; wrap: keep only bottom 16 bits
  add word[bp+si],ax ;~0x8000, can also be xor (cf=0)
  fild word[bp+si] ;; x = x-round(x) | y = y-round(y)
  fxch st1
  inc cx
  jpe F ; loop 2x: cl=0x29, zero flag = 0

; interfering concentric circles

; subroutine: compute length of 2D vector, scale to access cos table
LEN: ;; x y -> [bp+si] = sqrt(x*x+y*y)/65536/2 * 16384/2pi = sqrt(x*x+y*y)*C = sqrt(C^2*(x*x+y*y))
  fld st1
  fmul st0
  fld st1   ; -1 byte: cmc, jc LEN (but it's slow in DOSBox)
  fmul st0
  faddp
  fmul d(TABLE_STEP)  ; exact: (16384/2pi/65536/2)^2 = 0.000395785+
  fsqrt
  fistp word[bp+si]
  jz LEN_RET

  push bx

; k = color_mul / cos(5*length(x,y) + d - t3)
; [R G B] += k * ( 1 + 2 * cos(3*(i/40 + t4-d) + [1.8 0.9 0]) );
  dec dh
  imul bx,dx,3      ; bx = q = 65536/2pi * 3*(i/40 + t4-d)
Q fld1
  fadd dword[bx]
  fadd dword[bx]    ;; 1+2cos(q) x y R G B
  sub bh,cl         ; q += ~ 0.9 * 65536/2pi
  dec cx ; cl=0x26
  jpe Q  ; loop 3x ;; [dR dG dB]=1+2cos(q+[1.8 0.9 0])) x y R G B  ; bp=3

  imul bx,[bp+si],10*4 ; 65536/2pi * 5*length(x,y)
K fmul dword[fs:bx+di] ; k = color_mul / cos(5*length(x,y) + d - t3)
  faddp st5,st0        ;; dG gB x y R+=k*dR G B
  add al,0x55  ; +55 +aa +ff(=-1),carry
  jnc K ; loop 3x     ;; x y R+=k*dR G+=k*dG B+=k*dB

  pop bx

  jnz I ; al=0, cl=0x26

  fcompp            ;; R G B

; Assemble R,G,B into 16-bit high color (5+6+5 bits). Clamp to 0..0xffff.
  inc ax  ; 0x8001
A fmul st0          ;; R^2 G^2 B^2, want 0..0xffff
  fistp word[bp+si] ; if it's > 0x7fff, clamp to 0x8000
  imul bx,[bp+si],2 ; double, set carry if it was > 0x3fff
  sbb bx,bp         ; overflow -> 0xffff
  xor cl,5^6        ; cl & 0x1f = shift length: flip 5<->6
  shld ax,bx,cl ; rrrrrggggggbbbbb
  jnc A ; loop 3x until you shift the 1 bit out of ax

  ret

;%include "screenshot320_16.inc"
